Note that I also ran the EMS statistics in two separate turnover scripts. The output from these will be loaded here.
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:phyloseq':
##
## sampleNames
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following objects are masked from 'package:genefilter':
##
## rowSds, rowVars
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
##
## apply
discovery = readRDS("~/Dropbox/Periodontology2000/katie_biogeo_tree.RDS")
discovery
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1679 taxa and 84 samples ]
## sample_data() Sample Data: [ 84 samples by 61 sample variables ]
## tax_table() Taxonomy Table: [ 1679 taxa by 6 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 1679 tips and 1677 internal nodes ]
#how many subjects
levels(sample_data(discovery)$Subject)
## [1] "72" "73" "XC3"
#what's the median sequencing depth
summary(sample_sums(discovery))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 22145 27413 28717 35102 75977
decontam.phy=readRDS("~/Desktop/Periodontol2000/periodontol_moresubjects.RDS")
decontam.phy = subset_samples(decontam.phy, Subject !="Control")
#how many samples are in the validation dataset?
nsamples(decontam.phy)
## [1] 663
#how many subjects are in the validation dataset?
length(levels(sample_data(decontam.phy)$Subject))
## [1] 8
#how many healthy aim 1 subjects are in the validation dataset?
aim1 = subset_samples(decontam.phy, Aim=="SA1" | Aim=="Pilot")
length(levels(sample_data(aim1)$Subject))
## [1] 5
#how many healthy aim 3 subjects are in the validation dataset?
aim3 = subset_samples(decontam.phy, Aim=="SA3")
length(levels(sample_data(aim3)$Subject))
## [1] 3
#how many subgingival samples
sub = subset_samples(decontam.phy, Habitat_Class=="Sub")
sub = prune_taxa(taxa_sums(sub) > 0, sub)
sub = prune_samples(sample_sums(sub) > 0, sub)
#what subgingival samples were collected
#what sample sites were sampled
sub = subset_samples(decontam.phy, Habitat_Class=="Sub")
levels(sample_data(sub)$Tooth_Number)
## [1] "11" "14" "19" "22" "24" "25" "27" "3" "30" "6" "8" "9"
#how many supragingival samples
supra = subset_samples(decontam.phy, Habitat_Class=="Supra")
#how many ASVs are there
decontam.phy
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 625 taxa and 663 samples ]
## sample_data() Sample Data: [ 663 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 625 taxa by 7 taxonomic ranks ]
#write a csv and contruct a highest taxonomic rank degination in the taxa table
df = data.frame(tax_table(decontam.phy))
#write.csv(df, file="periodontology2000_tax.csv")
newtax= read.csv("~/Desktop/Projects/Stanford/RelmanLab/Hyposalivation/Periodontol2000/periodontology2000_tax.csv", row.names = 1)
newtax = as.matrix(newtax)
newtax= tax_table(newtax)
tax_table(decontam.phy)= newtax
#replae taxa names with the highest label
df=data.frame(tax_table(decontam.phy))
taxa_names(decontam.phy)=df$Highest.Rank
#what is the median sequencing depth of the validation dataset?
summary(sample_sums(decontam.phy))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 27408 53636 52310 76684 133778
df = data.frame(sample_sums(decontam.phy), sample_data(decontam.phy))
colnames(df)[1] = "depth"
p = ggplot(df, aes(depth, color=Habitat_Class)) + geom_density()
p
mysum = doBy::summaryBy(depth~Habitat_Class, df, FUN=c(median, mean, length, sum, min, max))
mysum
## Habitat_Class depth.median depth.mean depth.length depth.sum depth.min
## 1 Supra 70382.5 62679.78 440 27579102 1
## 2 Sub 30174.0 31848.26 223 7102161 4345
## depth.max
## 1 133778
## 2 80403
#how many unique phyla are found?
length(get_taxa_unique(decontam.phy, taxonomic.rank="Phylum"))
## [1] 15
#What is the relative abundance of each Phylum?
library(doBy)
phy <- tax_glom(decontam.phy, taxrank="Phylum")
tax.count <- data.frame(taxa_sums(phy),tax_table(phy)[,2])
rownames(tax.count)= NULL
colnames(tax.count)[1] <- c("Abundance")
tax.count$Percent <- round(tax.count$Abundance/sum(tax.count$Abundance)*100, 4)
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:IRanges':
##
## desc
## The following object is masked from 'package:S4Vectors':
##
## rename
Phylum_df <- arrange(tax.count, desc(Percent))
colnames(Phylum_df) = c("Total.Abundance", "Phylum", "Total.Percent")
#how much do the top 5 phyla contribute to total abundance?
top5 <- Phylum_df[1:5, ]
round(sum(top5$Percent),2)
## [1] 0
#What are the top 5 Phyla?
top5
## Total.Abundance Phylum Total.Percent
## 1 14981045 Firmicutes 43.2156
## 2 7412528 Proteobacteria 21.3828
## 3 6535861 Actinobacteria 18.8539
## 4 3810015 Bacteroidetes 10.9907
## 5 1695931 Fusobacteria 4.8922
##################what about the subgingival samples
subgingival.phy = subset_samples(decontam.phy, Habitat_Class=="Sub")
phy <- tax_glom(subgingival.phy, taxrank="Phylum")
tax.count <- data.frame(taxa_sums(phy),tax_table(phy)[,2])
rownames(tax.count)= NULL
colnames(tax.count)[1] <- c("Abundance")
tax.count$Percent <- round(tax.count$Abundance/sum(tax.count$Abundance)*100, 4)
Phylum_sub <- arrange(tax.count, desc(Percent))
Phylum_sub <- subset(Phylum_sub, Abundance > 0)
colnames(Phylum_sub) = c("Sub.Abundance", "Phylum" ,"Sub.Percent")
##################what about the supragingival samples
supragingival.phy = subset_samples(decontam.phy, Habitat_Class=="Supra")
phy <- tax_glom(supragingival.phy, taxrank="Phylum")
tax.count <- data.frame(taxa_sums(phy),tax_table(phy)[,2])
rownames(tax.count)= NULL
colnames(tax.count)[1] <- c("Abundance")
tax.count$Percent <- round(tax.count$Abundance/sum(tax.count$Abundance)*100, 4)
Phylum_supra <- arrange(tax.count, desc(Percent))
Phylum_suora <- subset(Phylum_supra, Abundance > 0)
colnames(Phylum_supra) = c("Supra.Abundance", "Phylum" ,"Supra.Percent")
#plot the phylum level differenes
df = join(Phylum_sub, Phylum_supra, by="Phylum")
df= join(df, Phylum_df, by="Phylum")
dfm = melt(df, id.vars = c("Sub.Abundance", "Supra.Abundance", "Phylum", "Total.Abundance"))
fig1a = ggplot(dfm, aes(variable, value, fill=Phylum)) + geom_bar(position="stack", stat="identity") +
scale_x_discrete(labels=c("Sub.Percent" = "Subgingival", "Supra.Percent" = "Supragingival", "Total.Percent" = "Totals")) +
xlab("Habitat Class") + ylab("Percent Relative Abundance")
#test differences using proportion test
testme = as.matrix(cbind(df[,3], df[,5]))
prop.test(testme)
## Warning in prop.test(testme): Chi-squared approximation may be incorrect
##
## 8-sample test for equality of proportions without continuity
## correction
##
## data: testme
## X-squared = 50.353, df = 7, p-value = 1.232e-08
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4 prop 5 prop 6 prop 7
## 0.8234193 0.5609600 0.2510159 0.8861476 0.3021838 0.9844929 0.9387453
## prop 8
## 0.1086957
#What is the number of Genera found?
length(get_taxa_unique(decontam.phy, taxonomic.rank="Genus"))
## [1] 146
#what are the abundance levels of each genus?
supra.genus <- tax_glom(decontam.phy, taxrank="Genus")
tax.count <- data.frame(tax_table(supra.genus)[,2:7], taxa_sums(supra.genus))
rownames(tax.count) = NULL
colnames(tax.count) <- c("Phylum","Class","Order","Family","Genus", "Species", "Abundance")
tax.count$Percent <- round(tax.count$Abundance/sum(tax.count$Abundance)*100, 4)
library(plyr)
Genus_df <- arrange(tax.count, desc(Percent))
Genus_df <- Genus_df[order(Genus_df$Percent, decreasing=TRUE), ]
#how much do the top 10 genera contribute to total abundance?
top10 <- Genus_df[1:10, ]
round(sum(top10$Percent),3)
## [1] 85.584
#what about the top 5
top5 <- Genus_df[1:5, ]
round(sum(top5$Percent),3)
## [1] 68.078
###How diverse are the top 10 genera? i.e., how many species are there per genus?
top10 <- as.vector(Genus_df$Genus[1:10])
Diversity.list <- vector("list", 10)
names(Diversity.list) <- top10
for(i in 1:length(top10)){
physub = subset_taxa(decontam.phy, Genus==top10[i])
physub = prune_taxa(taxa_sums(physub) > 0, physub)
Diversity.list[[i]] <- physub
}
#compute the number of taxa in each element of the list
Ntaxa <- data.frame(unlist(lapply(Diversity.list, ntaxa)))
colnames(Ntaxa) <- "N.Species"
#Make a table with percent abundance and number of taxa
genus.tab <- data.frame(Genus_df[1:10,], Ntaxa)
library(knitr)
kable(genus.tab, format="markdown")
| Phylum | Class | Order | Family | Genus | Species | Abundance | Percent | N.Species |
|---|---|---|---|---|---|---|---|---|
| Firmicutes | Bacilli | Lactobacillales | Streptococcaceae | Streptococcus | NA | 9749877 | 33.0018 | 10 |
| Actinobacteria | Actinobacteria | Actinomycetales | Micrococcaceae | Rothia | NA | 3747361 | 12.6842 | 5 |
| Firmicutes | Negativicutes | Selenomonadales | Veillonellaceae | Veillonella | NA | 2774858 | 9.3925 | 9 |
| Proteobacteria | Betaproteobacteria | Neisseriales | Neisseriaceae | Neisseria | NA | 1970888 | 6.6711 | 11 |
| Bacteroidetes | Bacteroidia | Bacteroidales | Prevotellaceae | Prevotella | NA | 1869542 | 6.3281 | 50 |
| Actinobacteria | Actinobacteria | Actinomycetales | Corynebacteriaceae | Corynebacterium | NA | 1372420 | 4.6454 | 4 |
| Actinobacteria | Actinobacteria | Actinomycetales | Actinomycetaceae | Actinomyces | NA | 1179575 | 3.9927 | 15 |
| Fusobacteria | Fusobacteriia | Fusobacteriales | Fusobacteriaceae | Fusobacterium | NA | 1092125 | 3.6967 | 22 |
| Firmicutes | Bacilli | Lactobacillales | Aerococcaceae | Abiotrophia | NA | 800269 | 2.7088 | 1 |
| Firmicutes | Bacilli | Bacillales | Bacillales_Incertae_Sedis_XI | Gemella | NA | 727480 | 2.4624 | 2 |
write.csv(genus.tab, file="TableS2.csv")
mi= subset_samples(decontam.phy, Tooth_Class %in% c("Incisor_Central", "Molar"))
mi = prune_samples(sample_sums(mi) > 0, mi)
subgingival.phy = subset_samples(mi, Habitat_Class=="Sub")
subgingival.phy = prune_taxa(taxa_sums(subgingival.phy) > 0, subgingival.phy)
subgingival.phy = prune_samples(sample_sums(subgingival.phy) > 0, subgingival.phy)
supragingival.phy = subset_samples(mi, Habitat_Class=="Supra")
supragingival.phy = prune_taxa(taxa_sums(supragingival.phy) > 0, supragingival.phy)
supragingival.phy = prune_samples(sample_sums(supragingival.phy) > 0, supragingival.phy)
meths = c("Shannon", "Chao1", "Simpson")
sites = levels(as.factor(sample_data(subgingival.phy)$Habitat))
sub.holder <-vector('list',length(sites))
supra.holder <- vector('list', length(sites))
prare.holder <- vector('list', length(sites))
brare.holder <- vector('list', length(sites))
for(i in length(sites))
{
# Estimate diversity and make data frame
erDF <- estimate_richness(subgingival.phy, split = TRUE, measures = meths)
df <- data.frame(erDF, sample_data(subgingival.phy))
sub.holder[[i]] <- df
}
## Warning in estimate_richness(subgingival.phy, split = TRUE, measures = meths): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
for(i in length(sites))
{
# Estimate diversity and make data frame
erDF <- estimate_richness(supragingival.phy, split = TRUE, measures = meths)
df <- data.frame(erDF, sample_data(supragingival.phy))
supra.holder[[i]] <- df
}
#rarefy the supragingival samples to
set.seed(191)
supra.rare = rarefy_even_depth(supragingival.phy, sample.size=20000, replace=FALSE)
## You set `rngseed` to FALSE. Make sure you've set & recorded
## the random seed of your session for reproducibility.
## See `?set.seed`
## ...
## 19 samples removedbecause they contained fewer reads than `sample.size`.
## Up to first five removed samples are:
## 1101C01109R001104C01109R001104C01224R001102C01109R001102C01219R00
## ...
## 15OTUs were removed because they are no longer
## present in any sample after random subsampling
## ...
for(i in length(sites))
{
# Estimate diversity and make data frame
erDF <- estimate_richness(supra.rare, split = TRUE, measures = meths)
df <- data.frame(erDF, sample_data(supra.rare))
prare.holder[[i]] <- df
}
#rarefy the supragingival samples to
set.seed(191)
sub.rare = rarefy_even_depth(subgingival.phy, sample.size=20000, replace=FALSE)
## You set `rngseed` to FALSE. Make sure you've set & recorded
## the random seed of your session for reproducibility.
## See `?set.seed`
##
## ...
## 29 samples removedbecause they contained fewer reads than `sample.size`.
## Up to first five removed samples are:
## 1102C01314R001102C01319R001102C01330R001106C01303R081106C01309R08
## ...
## 1OTUs were removed because they are no longer
## present in any sample after random subsampling
## ...
brare.holder <- vector('list', length(sites))
for(i in length(sites))
{
# Estimate diversity and make data frame
erDF <- estimate_richness(sub.rare, split = TRUE, measures = meths)
df <- data.frame(erDF, sample_data(sub.rare))
brare.holder[[i]] <- df
}
df1 = do.call("rbind", sub.holder)
df2 = do.call("rbind", supra.holder)
df3 = do.call("rbind", prare.holder)
df3$Habitat_Class = "Rarefied_Supra"
df4 = do.call("rbind", brare.holder)
df4$Habitat_Class = "Rarefied_Sub"
df = data.frame(rbind(df1, df2, df3, df4))
dfm1 = melt(df, id.vars=colnames(sample_data(supragingival.phy)))
dfm1= subset(dfm1, variable !="se.chao1")
p1 = ggplot(dfm1, aes(Habitat_Class, value)) + facet_wrap(~variable, scales="free_y", ncol=4) + geom_boxplot()
p1
#test to see whether alpha diversity varies based on tooth class
df = subset(df, Habitat_Class %in% c("Rarefied_Supra", "Rarefied_Sub" ))
wilcox.test(Chao1~Habitat_Class, data=df, exact=TRUE, conf.int=TRUE)
## Warning in wilcox.test.default(x = c(67, 56.3333333333333, 40, 56, 56,
## 47.25, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(67, 56.3333333333333, 40, 56, 56,
## 47.25, : cannot compute exact confidence intervals with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: Chao1 by Habitat_Class
## W = 14116, p-value = 3.642e-07
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 6.75005 14.33328
## sample estimates:
## difference in location
## 10.49998
wilcox.test(Shannon~Habitat_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test
##
## data: Shannon by Habitat_Class
## W = 6761, p-value = 1.662e-07
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.4452602 -0.2030296
## sample estimates:
## difference in location
## -0.3244488
wilcox.test(Simpson~Habitat_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test
##
## data: Simpson by Habitat_Class
## W = 6815, p-value = 2.536e-07
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.05794179 -0.02556212
## sample estimates:
## difference in location
## -0.04179579
library(RColorBrewer)
### Create an index tooth biogeography subset
teeth = c("3", "8", "9", "14", "19", "24", "25", "30")
index_subsites <- subset_samples(decontam.phy, Tooth_Number %in% teeth)
index_subsites
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 625 taxa and 279 samples ]
## sample_data() Sample Data: [ 279 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 625 taxa by 10 taxonomic ranks ]
#how many samples are in average 2 subjects? Looks like about 25% of samples
summary(sample_data(index_subsites)$Subject)
## 1-101 1-102 1-104 1-105 1-106 3-301 3-302 3-303
## 29 33 32 32 57 32 32 32
#filter the taxa to include those present in 25% of samples
filtergroup = filterfun(kOverA(k=70, A=1)) #k = number of samples; A = abundance
index25 = filter_taxa(index_subsites, filtergroup, prune=TRUE)
index25 = prune_taxa(taxa_sums(index25) > 0, index25)
index25 = prune_samples(sample_sums(index25) > 0, index25)
index25
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 71 taxa and 279 samples ]
## sample_data() Sample Data: [ 279 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 71 taxa by 10 taxonomic ranks ]
#convert all the data to relative abundance
indexRA = transform_sample_counts(index25, function(x) x / sum(x))
#Do PCOA
index.pcoa <- ordinate(indexRA, method="PCoA", distance="bray")
evals <- index.pcoa$values$Eigenvalues
### Use DESEq2 to identify taxa that are differentially abundant between subgingival and supragingival sites
diagdds = phyloseq_to_deseq2(index25, ~ Habitat_Class)
## converting counts to integer mode
# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
geoMeans = apply(counts(diagdds), 1, gm_mean)
diagdds = estimateSizeFactors(diagdds, geoMeans = geoMeans)
diagdds = DESeq(diagdds, fitType="local")
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 29 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res = results(diagdds, cooksCutoff = FALSE)
alpha = 0.05
sigtab = res[which(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(index25)[rownames(sigtab), ], "matrix"))
posigtab = sigtab[sigtab[, "log2FoldChange"] > 5, ]
posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]
library("ggplot2")
sigtabgen = subset(sigtab, !is.na(Genus))
# Phylum order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
# Genus order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
sigtabgen = subset(sigtabgen, padj < 0.05)
fig1a=ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) +
geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
geom_point(size=6) +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
print(fig1a)
#look at all the data as a function of tooth class
fig1b=plot_ordination(indexRA, index.pcoa, type="samples", color="Habitat_Class") +
geom_point(size=2, alpha=0.1) + facet_wrap(~Tooth_Aspect)+ coord_fixed(sqrt(evals[2] / evals[1]))+
scale_color_discrete(name="Habitat Class",
breaks=c("Supra", "Sub"),
labels=c("Supragingival Plaque", "Subgingival Plaque"))
fig1c=plot_ordination(indexRA, index.pcoa, type="samples", color="Aim") + geom_point(size=2, alpha=0.1) +
theme_bw() + facet_wrap(~Tooth_Aspect)+coord_fixed(sqrt(evals[2] / evals[1]))+
scale_color_discrete(name="Health Condition",
breaks=c("SA1", "SA3"),
labels=c("Control", "Sjogren's"))
fig1d=plot_ordination(indexRA, index.pcoa, type="samples", color="Subject") + geom_point(size=2, alpha=0.1) +
theme_bw() + facet_wrap(~Tooth_Aspect)+coord_fixed(sqrt(evals[2] / evals[1]))+
scale_color_discrete(name="Subject",
breaks=c("1-101", "1-102", "1-103", "1-104", "1-105", "1-106", "3-301", "3-302", "3-303"),
labels=c("Control 1", "Control 2", "Control 3", "Control 4", "Control 5", "Control 6",
"Sjogren's 1", "Sjogren's 2", "Sjogren's 3"))
#plot the taxa
p = plot_ordination(indexRA, index.pcoa, "taxa")
df = data.frame(p$data, tax_table(indexRA), taxa_sums(index25))
colnames(df)[17] = "sum"
rownames(df) = NULL
df$Highest = paste0(df$Genus, " ", df$Species)
df$Highest = str_replace_all(df$Highest, "NA", "")
fig1e=ggplot(df, aes(Axis.1, Axis.2, label=Highest, color=Phylum))+coord_fixed(sqrt(evals[2] / evals[1]))+
xlab("Axis 1 [24.1%]") + ylab("Axis 2 [10.8%]") + geom_text_repel(aes(Axis.1, Axis.2, label = Highest), size = 3)
grid.arrange(fig1b, fig1e, ncol=2)
ggsave(grid.arrange(fig1b, fig1e, ncol=1), file="~/Desktop/Figure1.pdf", width = 12, height = 10, units ="in",dpi = 300)
Now let’s quantify the segregation of samples using an adonis on Bray Curtis.
library(vegan)
## Warning: package 'vegan' was built under R version 3.4.4
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-2
otus = data.frame(otu_table(index25))
otus.h = decostand(otus, "hell")
attach(sample_data(index25))
## The following object is masked _by_ .GlobalEnv:
##
## x
set.seed(12)
index.adonis <- adonis(otus.h ~Habitat_Class*Aim*Tooth_Class*Tooth_Aspect, permutations=1000, strata = Subject)
df <- format(data.frame(index.adonis$aov.tab), digits=2)
df
## Df SumsOfSqs MeanSqs F.Model
## Habitat_Class 1 21.60 21.60 134.34
## Aim 1 9.07 9.07 56.44
## Tooth_Class 1 1.05 1.05 6.51
## Tooth_Aspect 1 1.08 1.08 6.73
## Habitat_Class:Aim 1 2.12 2.12 13.16
## Habitat_Class:Tooth_Class 1 0.34 0.34 2.13
## Aim:Tooth_Class 1 0.53 0.53 3.29
## Habitat_Class:Tooth_Aspect 1 0.44 0.44 2.71
## Aim:Tooth_Aspect 1 0.26 0.26 1.64
## Tooth_Class:Tooth_Aspect 1 0.41 0.41 2.54
## Habitat_Class:Aim:Tooth_Class 1 0.17 0.17 1.09
## Habitat_Class:Aim:Tooth_Aspect 1 0.15 0.15 0.93
## Habitat_Class:Tooth_Class:Tooth_Aspect 1 0.27 0.27 1.67
## Aim:Tooth_Class:Tooth_Aspect 1 0.19 0.19 1.17
## Habitat_Class:Aim:Tooth_Class:Tooth_Aspect 1 0.22 0.22 1.36
## Residuals 263 42.28 0.16 NA
## Total 278 80.18 NA NA
## R2 Pr..F.
## Habitat_Class 0.2694 0.001
## Aim 0.1132 0.001
## Tooth_Class 0.0130 0.001
## Tooth_Aspect 0.0135 0.001
## Habitat_Class:Aim 0.0264 0.001
## Habitat_Class:Tooth_Class 0.0043 0.033
## Aim:Tooth_Class 0.0066 0.005
## Habitat_Class:Tooth_Aspect 0.0054 0.006
## Aim:Tooth_Aspect 0.0033 0.053
## Tooth_Class:Tooth_Aspect 0.0051 0.010
## Habitat_Class:Aim:Tooth_Class 0.0022 0.161
## Habitat_Class:Aim:Tooth_Aspect 0.0019 0.234
## Habitat_Class:Tooth_Class:Tooth_Aspect 0.0034 0.064
## Aim:Tooth_Class:Tooth_Aspect 0.0024 0.120
## Habitat_Class:Aim:Tooth_Class:Tooth_Aspect 0.0027 0.092
## Residuals 0.5273 NA
## Total 1.0000 NA
detach(sample_data(index25))
Figure 2A: We see that communities differ by tooth class at supragingival sites, but not as much by subgingival sites. Interestingly, subgingival diversity appears to be most variability between molar sites within the healthy control group.
#### 1. Compute the third-order orthogonal polynomial terms
subs = levels(sample_data(index_subsites)$Habitat_Class)
otu.list <- vector("list", length(subs))
names(otu.list) <- subs
for(i in 1:length(subs)){
#hellinger transform the data
library(vegan)
x1 = subset_samples(index_subsites, Habitat_Class==subs[[i]])
x1 = prune_taxa(taxa_sums(x1) > 0, x1)
x1 = prune_samples(sample_sums(x1) > 0, x1)
otus = data.frame(otu_table(x1))
otus.h = decostand(otus, "hellinger")
map = sample_data(x1)
map$x = as.numeric(as.character(map$x))
map$y = as.numeric(as.character(map$y))
#center the coordinates
xygrid = cbind(map$x, map$y)
xygrid.c <- scale(xygrid, scale=FALSE)
### Compute the third-order orthogonal polynomial function using centered geographic coordinates
poly.xy3 <- poly(xygrid.c, degree = 3, raw=FALSE) #, coefs=TRUE)
colnames(poly.xy3) <- c("X", "X^2", "X^3", "Y", "XY", "X^2Y", "Y^2", "XY^2", "Y^3")
poly.xy3.df <- data.frame(poly.xy3, map$x, map$y)
library(ade4)
#perform the RDA on the hellinger transformed data; extract the coordinates
rld.pca <- dudi.pca(otus.h , center=TRUE, scale=TRUE, scannf=FALSE, nf=10)
rld.xy3 <- pcaiv(rld.pca, poly.xy3, scannf = FALSE, nf = 6)
rld.xy3.df <- data.frame(rld.xy3$ls, map)
xy3.df <- data.frame(rld.xy3.df, poly.xy3)
# how much of the variance does the trend surface model explain?
rld.xy3.var <- sum(rld.xy3$eig)/sum(rld.pca$eig)*100
rld.xy3.var
#look at the eigenvalues
rld.xy3$eig
#look at the screeplot
screeplot(rld.xy3) #we should look at the first 3 axes
#what is the percent of ewxplained variance
Explainedvariance = rld.xy3$eig/sum(rld.xy3$eig)*100
Explainedvariance
#force x, y to numeric
xy3.df$x <- as.numeric(as.character(xy3.df$x))
xy3.df$y <- as.numeric(as.character(xy3.df$y))
xy3.df$subsetsite = subs[[i]]
otu.list[[i]] = xy3.df
}
##
## Attaching package: 'ade4'
## The following object is masked from 'package:GenomicRanges':
##
## score
## The following object is masked from 'package:IRanges':
##
## score
## The following object is masked from 'package:BiocGenerics':
##
## score
xy3.df = do.call("rbind", otu.list)
### Force variables to order in ggplot
order=1:32
xy3.df$Tooth_Number <- factor(xy3.df$Tooth_Number, as.character(order))
xy3.df$tclass = xy3.df$Tooth_Class
xy3.df$tclass = str_replace_all(xy3.df$tclass, "Incisor_Central", "Incisor")
xy3.df$tclass = str_replace_all(xy3.df$tclass, "Incisor_Lateral", "Incisor")
#plot axis 1
cbPalette <- c("grey76", "Salmon", "#00BFC4", "grey76")
sub.df = subset(xy3.df, Habitat_Class=="Sub")
supra.df = subset(xy3.df, Habitat_Class=="Supra")
fig2a = ggplot(supra.df, aes(Tooth_Number, Axis1, group=Aim)) + theme_bw() + ylab("Axis 1") +
facet_wrap(Habitat_Class~Aim)+ xlab("Tooth") + geom_point() + geom_smooth() + ggtitle("a)")+
scale_x_discrete(breaks = seq(from=0, to=30, by=5), labels=seq(from=0, to=30, by=5))
fig2b = ggplot(sub.df, aes(Tooth_Number, Axis1, group=Aim)) + theme_bw() + ylab("Axis 1") +
facet_wrap(Habitat_Class~Aim)+ xlab("Tooth") + geom_point() + geom_smooth() + ggtitle("b)")+
scale_x_discrete(breaks = seq(from=0, to=30, by=5), labels=seq(from=0, to=30, by=5))
grid.arrange(fig2a, fig2b, ncol=2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave(grid.arrange(fig2a, fig2b, ncol=2), file="~/Desktop/Figure2.pdf", width = 6, height = 3, units ="in",dpi = 300)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#let's clean the data up so that we retain only samples present in an individual mouth at both sub/supra sites
subgingival.phy = subset_samples(decontam.phy, Habitat_Class=="Sub")
foo = sample_data(subgingival.phy)$Habitat
foo = colsplit(foo, "([_])", c("junk", "Habitat"))
sample_data(subgingival.phy)$Habitat.specific = foo$Habitat
#subgingival
P1.1a = subset_samples(subgingival.phy, Subject=="1-101") #20
P1.2a = subset_samples(subgingival.phy, Subject=="1-102") #22
P1.3a = subset_samples(subgingival.phy, Subject=="1-104") #24
P1.4a = subset_samples(subgingival.phy, Subject=="1-105") #24
P1.5a = subset_samples(subgingival.phy, Subject=="1-106" & Replicate=="R00") #19
P1.5a = subset_samples(P1.5a, Habitat !="Sub_09L")
P1.6a = subset_samples(subgingival.phy, Subject=="3-301") #24
P1.7a = subset_samples(subgingival.phy, Subject=="3-302") #24
P1.8a = subset_samples(subgingival.phy, Subject=="3-303") #
statis_subgingival = merge_phyloseq(P1.1a, P1.2a, P1.3a, P1.4a, P1.5a, P1.6a, P1.7a, P1.8a)
statis_subgingival= prune_samples(sample_sums(statis_subgingival) > 0, statis_subgingival)
statis_subgingival
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 625 taxa and 182 samples ]
## sample_data() Sample Data: [ 182 samples by 31 sample variables ]
## tax_table() Taxonomy Table: [ 625 taxa by 10 taxonomic ranks ]
keep = levels(sample_data(subgingival.phy)$Tooth_Number)
supragingival.phy = subset_samples(decontam.phy, Habitat_Class=="Supra")
supragingival.phy=subset_samples(supragingival.phy, Tooth_Number %in% keep)
foo = sample_data(supragingival.phy)$Habitat
foo = colsplit(foo, "([_])", c("junk", "Habitat"))
sample_data(supragingival.phy)$Habitat.specific = foo$Habitat
#supragingival
P1.1b = subset_samples(supragingival.phy, Subject=="1-101" & !(Habitat %in% c("Supra_08B", "Supra_08L" ,"Supra_09B" ,"Supra_09L" )))
P1.2b = subset_samples(supragingival.phy, Subject=="1-102" & !(Habitat %in% c("Supra_27L"))) #25
P1.3b = subset_samples(supragingival.phy, Subject=="1-104") #24
P1.4b = subset_samples(supragingival.phy, Subject=="1-105") #24
P1.5b = subset_samples(supragingival.phy, Subject=="1-106" & !(Habitat %in% c( "Supra_27B", "Supra_27L", "Supra_30B" ,"Supra_30L")))
P1.6b = subset_samples(supragingival.phy, Subject=="3-301") #24
P1.7b = subset_samples(supragingival.phy, Subject=="3-302") #24
P1.8b = subset_samples(supragingival.phy, Subject=="3-303") #& !(Habitat %in% c("Supra_06B" "Supra_06L"
statis_supragingival = merge_phyloseq(P1.1b, P1.2b, P1.3b, P1.4b, P1.5b, P1.6b, P1.7b, P1.8b)
statis_supragingival = subset_samples(statis_supragingival, Replicate !="R04")
statis_supragingival= prune_samples(sample_sums(statis_supragingival) > 0, statis_supragingival)
#make a new phyloseq object
statis = merge_phyloseq(statis_supragingival, statis_subgingival)
#remove taxa present in fewert han 25% of samples
filtergroup = filterfun(kOverA(k=70, A=10)) #k = number of samples; A = abundance
sub.filt = filter_taxa(statis, filtergroup, prune=TRUE)
sub.filt = prune_samples(sample_sums(sub.filt) > 0, sub.filt)
sub.filt
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 83 taxa and 364 samples ]
## sample_data() Sample Data: [ 364 samples by 31 sample variables ]
## tax_table() Taxonomy Table: [ 83 taxa by 10 taxonomic ranks ]
# read in the perio data
s101p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Bopcal_SA1/CSV/REDCap17_ESE_bopcal_SA1_v5.2_20140731_DD.csv")
s102p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Bopcal_SA1/CSV/REDCap18_ESE_bopcal_SA1_v5.2_20140805_DD.csv")
s104p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Bopcal_SA1/CSV/REDCap39_ESE_bopcal_SA1_v5.2_20140925_DD.csv")
s105p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Bopcal_SA1/CSV/REDCap49_ESE_bopcal_SA1_v5.2_20141021_DD.csv")
s106p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Bopcal_SA1/CSV/REDCap63_ESE_bopcal_SA1_v5.2_20141104_DD.csv")
s301p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA3_data/Bopcal_SA3/CSV/REDCap66_ESE_bopcal_SA3_v5.2_20141120_DD.csv")
s302p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA3_data/Bopcal_SA3/CSV/REDCap178_ESE_bopcal_SA3_v5.2_20150528_DD.csv")
s303p = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA3_data/Bopcal_SA3/CSV/REDCap219_ESE_bopcal_SA3_v5.2_20150818_SK.csv")
perio.df = data.frame(rbind(s101p, s102p, s104p, s105p, s106p, s301p, s302p, s303p))
perio.df = subset(perio.df, pd !="Missing")
perio.df = subset(perio.df, tooth !="NA")
pd = doBy::summaryBy(pd~redcap+tooth, data=perio.df, FUN=mean)
colnames(pd) = c("redcap", "Tooth_Number", "PD")
cal = doBy::summaryBy(cal~redcap+tooth, data=perio.df, FUN=mean)
colnames(cal) = c("redcap", "Tooth_Number", "CAL")
bop = doBy::summaryBy(percent.bop~redcap+tooth, data=perio.df, FUN=mean)
colnames(bop) = c("redcap", "Tooth_Number", "BOP")
#read in the caries data #
s101c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Axium_Data_SA1/CSV/REDCap17_ESE_Axium_SA1_v1.2_20140731_DD.csv")
s102c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Axium_Data_SA1/CSV/REDCap18_ESE_Axium_SA1_v1.2_20140819_DP.csv")
s104c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Axium_Data_SA1/CSV/REDCap39_ESE_Axium_SA1_v1.2_20140925_DD.csv")
s105c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Axium_Data_SA1/CSV/REDCap49_ESE_Axium_SA1_v1.2_20141021_DD.csv")
s106c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA1_data/Axium_Data_SA1/CSV/REDCap63_ESE_Axium_SA1_v1.2_20141104_DD.csv")
s301c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA3_data/Axium_Data_SA3/CSV/REDCap66_ESE_Axium_SA3_v1.2_20141120_DD.csv")
s302c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA3_data/Axium_Data_SA3/CSV/REDCap178_ESE_Axium_SA3_v1.2_20150528_DD.csv")
s303c = read.csv("~/Dropbox/Hyposalivation_UCSF-StanfordFileSharing/SA3_data/Axium_Data_SA3/CSV/REDCap219_ESE_Axium_SA3_v1.2_20150818_SK.csv")
caries.df = data.frame(rbind(s101c, s102c, s104c, s105c, s106c, s301c, s302c, s303c))
caries.df = subset(caries.df, description !="MISSING")
caries.df = subset(caries.df, tooth !="NA")
caries.df$binary = ifelse(caries.df$description =="SOUND", 0, 1)
dmfs = summaryBy(binary~redcap + tooth, data=caries.df, FUN=sum)
colnames(dmfs) =c("redcap", "Tooth_Number", "dmfs")
map = data.frame(sample_data(sub.filt))
pids = read.csv("~/Desktop/pids.csv")
colnames(pids) = c("redcap", "Subject")
flow = read.csv("~/Dropbox/Periodontology2000/flowrates.csv")
new.map = join(map, pids, "Subject")
new.map = join(new.map, pd)
## Joining by: Tooth_Number, redcap
new.map = join(new.map, cal)
## Joining by: Tooth_Number, redcap
new.map = join(new.map, bop)
## Joining by: Tooth_Number, redcap
new.map = join(new.map, flow)
## Joining by: Subject, redcap
new.map = join(new.map, dmfs)
## Joining by: Tooth_Number, redcap
rownames(new.map) = new.map$X.SampleID
sample_data(sub.filt) = sample_data(new.map)
#set an analysis up for CCA
otus = otu_table(sub.filt)
otus = decostand(otus, "hell")
map1 = data.frame(sample_data(sub.filt)$Habitat_Class, sample_data(sub.filt)$UWS_FR, sample_data(sub.filt)$CAL,
sample_data(sub.filt)$PD, sample_data(sub.filt)$BOP, sample_data(sub.filt)$dmfs)
colnames(map1) = c("Class", "uws_fr", "cal", "pd", "bop", "dmfs")
map1$Class = as.numeric(map1$Class)
subs = as.vector(sample_data(sub.filt)$Subject)
runs = as.vector(sample_data(sub.filt)$Pool_Name)
#for some reason this wworks on the command line but not in knitr
attach(map1)
## The following objects are masked _by_ .GlobalEnv:
##
## bop, cal, dmfs, pd
flow.cca = vegan::cca(otus ~ map1$Class* map1$uws_fr* map1$cal* map1$pd* map1$bop* map1$dmfs, sitenv=map1)
#run an anova on this
set.seed(100)
flow.aov = anova.cca(flow.cca, by="term", strata=subs)
flow.aov
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus ~ map1$Class * map1$uws_fr * map1$cal * map1$pd * map1$bop * map1$dmfs, sitenv = map1)
## Df ChiSquare F
## map1$Class 1 0.51056 96.6991
## map1$uws_fr 1 0.22654 42.9072
## map1$cal 1 0.05838 11.0570
## map1$pd 1 0.05290 10.0198
## map1$bop 1 0.12729 24.1085
## map1$dmfs 1 0.04008 7.5918
## map1$Class:map1$uws_fr 1 0.10542 19.9672
## map1$Class:map1$cal 1 0.02503 4.7406
## map1$uws_fr:map1$cal 1 0.02665 5.0472
## map1$Class:map1$pd 1 0.02914 5.5197
## map1$uws_fr:map1$pd 1 0.03257 6.1691
## map1$cal:map1$pd 1 0.02653 5.0252
## map1$Class:map1$bop 1 0.04917 9.3122
## map1$uws_fr:map1$bop 1 0.09407 17.8159
## map1$cal:map1$bop 1 0.02315 4.3844
## map1$pd:map1$bop 1 0.01877 3.5544
## map1$Class:map1$dmfs 1 0.01971 3.7329
## map1$uws_fr:map1$dmfs 1 0.01838 3.4819
## map1$cal:map1$dmfs 1 0.01418 2.6848
## map1$pd:map1$dmfs 1 0.01034 1.9583
## map1$bop:map1$dmfs 1 0.00816 1.5460
## map1$Class:map1$uws_fr:map1$cal 1 0.01904 3.6064
## map1$Class:map1$uws_fr:map1$pd 1 0.01876 3.5541
## map1$Class:map1$cal:map1$pd 1 0.01659 3.1415
## map1$uws_fr:map1$cal:map1$pd 1 0.01164 2.2045
## map1$Class:map1$uws_fr:map1$bop 1 0.03583 6.7868
## map1$Class:map1$cal:map1$bop 1 0.00936 1.7727
## map1$uws_fr:map1$cal:map1$bop 1 0.01952 3.6975
## map1$Class:map1$pd:map1$bop 1 0.00893 1.6919
## map1$uws_fr:map1$pd:map1$bop 1 0.01250 2.3677
## map1$cal:map1$pd:map1$bop 1 0.01523 2.8851
## map1$Class:map1$uws_fr:map1$dmfs 1 0.01288 2.4388
## map1$Class:map1$cal:map1$dmfs 1 0.00664 1.2585
## map1$uws_fr:map1$cal:map1$dmfs 1 0.00875 1.6575
## map1$Class:map1$pd:map1$dmfs 1 0.00621 1.1764
## map1$uws_fr:map1$pd:map1$dmfs 1 0.00591 1.1185
## map1$cal:map1$pd:map1$dmfs 1 0.00473 0.8951
## map1$Class:map1$bop:map1$dmfs 1 0.00576 1.0913
## map1$uws_fr:map1$bop:map1$dmfs 1 0.00698 1.3214
## map1$cal:map1$bop:map1$dmfs 1 0.00984 1.8640
## map1$pd:map1$bop:map1$dmfs 1 0.00577 1.0925
## map1$Class:map1$uws_fr:map1$cal:map1$pd 1 0.00943 1.7860
## map1$Class:map1$uws_fr:map1$cal:map1$bop 1 0.01183 2.2399
## map1$Class:map1$uws_fr:map1$pd:map1$bop 1 0.00628 1.1893
## map1$Class:map1$cal:map1$pd:map1$bop 1 0.01003 1.8995
## map1$uws_fr:map1$cal:map1$pd:map1$bop 1 0.00642 1.2155
## map1$Class:map1$uws_fr:map1$cal:map1$dmfs 1 0.00685 1.2979
## map1$Class:map1$uws_fr:map1$pd:map1$dmfs 1 0.00375 0.7110
## map1$Class:map1$cal:map1$pd:map1$dmfs 1 0.00352 0.6663
## map1$uws_fr:map1$cal:map1$pd:map1$dmfs 1 0.00481 0.9104
## map1$Class:map1$uws_fr:map1$bop:map1$dmfs 1 0.00561 1.0628
## map1$Class:map1$cal:map1$bop:map1$dmfs 1 0.00392 0.7419
## map1$uws_fr:map1$cal:map1$bop:map1$dmfs 1 0.00632 1.1971
## map1$Class:map1$pd:map1$bop:map1$dmfs 1 0.00560 1.0601
## map1$uws_fr:map1$pd:map1$bop:map1$dmfs 1 0.00657 1.2434
## map1$cal:map1$pd:map1$bop:map1$dmfs 1 0.01118 2.1174
## map1$Class:map1$uws_fr:map1$cal:map1$pd:map1$bop 1 0.00476 0.9014
## map1$Class:map1$uws_fr:map1$cal:map1$pd:map1$dmfs 1 0.00427 0.8082
## map1$Class:map1$uws_fr:map1$cal:map1$bop:map1$dmfs 1 0.00428 0.8100
## map1$Class:map1$uws_fr:map1$pd:map1$bop:map1$dmfs 1 0.00457 0.8652
## map1$Class:map1$cal:map1$pd:map1$bop:map1$dmfs 1 0.00728 1.3797
## Residual 302 1.59452
## Pr(>F)
## map1$Class 0.001 ***
## map1$uws_fr 0.001 ***
## map1$cal 0.001 ***
## map1$pd 0.001 ***
## map1$bop 0.001 ***
## map1$dmfs 0.004 **
## map1$Class:map1$uws_fr 0.001 ***
## map1$Class:map1$cal 0.001 ***
## map1$uws_fr:map1$cal 0.027 *
## map1$Class:map1$pd 0.001 ***
## map1$uws_fr:map1$pd 0.001 ***
## map1$cal:map1$pd 0.035 *
## map1$Class:map1$bop 0.001 ***
## map1$uws_fr:map1$bop 0.001 ***
## map1$cal:map1$bop 0.144
## map1$pd:map1$bop 0.124
## map1$Class:map1$dmfs 0.001 ***
## map1$uws_fr:map1$dmfs 0.003 **
## map1$cal:map1$dmfs 0.112
## map1$pd:map1$dmfs 0.125
## map1$bop:map1$dmfs 0.344
## map1$Class:map1$uws_fr:map1$cal 0.001 ***
## map1$Class:map1$uws_fr:map1$pd 0.001 ***
## map1$Class:map1$cal:map1$pd 0.003 **
## map1$uws_fr:map1$cal:map1$pd 0.407
## map1$Class:map1$uws_fr:map1$bop 0.001 ***
## map1$Class:map1$cal:map1$bop 0.034 *
## map1$uws_fr:map1$cal:map1$bop 0.018 *
## map1$Class:map1$pd:map1$bop 0.047 *
## map1$uws_fr:map1$pd:map1$bop 0.033 *
## map1$cal:map1$pd:map1$bop 0.023 *
## map1$Class:map1$uws_fr:map1$dmfs 0.005 **
## map1$Class:map1$cal:map1$dmfs 0.129
## map1$uws_fr:map1$cal:map1$dmfs 0.233
## map1$Class:map1$pd:map1$dmfs 0.181
## map1$uws_fr:map1$pd:map1$dmfs 0.328
## map1$cal:map1$pd:map1$dmfs 0.606
## map1$Class:map1$bop:map1$dmfs 0.280
## map1$uws_fr:map1$bop:map1$dmfs 0.301
## map1$cal:map1$bop:map1$dmfs 0.362
## map1$pd:map1$bop:map1$dmfs 0.660
## map1$Class:map1$uws_fr:map1$cal:map1$pd 0.029 *
## map1$Class:map1$uws_fr:map1$cal:map1$bop 0.010 **
## map1$Class:map1$uws_fr:map1$pd:map1$bop 0.176
## map1$Class:map1$cal:map1$pd:map1$bop 0.033 *
## map1$uws_fr:map1$cal:map1$pd:map1$bop 0.241
## map1$Class:map1$uws_fr:map1$cal:map1$dmfs 0.139
## map1$Class:map1$uws_fr:map1$pd:map1$dmfs 0.707
## map1$Class:map1$cal:map1$pd:map1$dmfs 0.720
## map1$uws_fr:map1$cal:map1$pd:map1$dmfs 0.757
## map1$Class:map1$uws_fr:map1$bop:map1$dmfs 0.317
## map1$Class:map1$cal:map1$bop:map1$dmfs 0.613
## map1$uws_fr:map1$cal:map1$bop:map1$dmfs 0.789
## map1$Class:map1$pd:map1$bop:map1$dmfs 0.255
## map1$uws_fr:map1$pd:map1$bop:map1$dmfs 0.551
## map1$cal:map1$pd:map1$bop:map1$dmfs 0.099 .
## map1$Class:map1$uws_fr:map1$cal:map1$pd:map1$bop 0.495
## map1$Class:map1$uws_fr:map1$cal:map1$pd:map1$dmfs 0.646
## map1$Class:map1$uws_fr:map1$cal:map1$bop:map1$dmfs 0.531
## map1$Class:map1$uws_fr:map1$pd:map1$bop:map1$dmfs 0.667
## map1$Class:map1$cal:map1$pd:map1$bop:map1$dmfs 0.126
## Residual
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#how much variance does the model explain
eig = flow.cca$CCA$eig
eigp = 100*(eig/sum(eig))
head(eigp)
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6
## 27.671209 19.323712 10.865190 8.335889 6.015871 4.889771
#get the sample site scores and plot them
sites = data.frame(flow.cca$CCA$wa, sample_data(sub.filt))
sites$Aim = ifelse(sites$Aim=="SA1", "Control", "Sjogrens")
#convert the MFS score to factor
p1 = ggplot(sites, aes(CCA1, CCA2, color=as.factor(Aim))) + geom_point() + guides(color=guide_legend(title="Health Status")) +
ggtitle("a)") + xlab("CCA1 (27.7%)") + ylab("CCA2 (19.3%)") +
theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +
coord_fixed(sqrt(eigp[2] / eigp[1]))
p2 = ggplot(sites, aes(CCA1, CCA2, color=uws_fr)) + geom_point() + guides(color=guide_legend(title="uws_fr")) +
ggtitle("b)") + xlab("CCA1 (27.7%)") + ylab("CCA2 (19.3%)") +
theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +
coord_fixed(sqrt(eigp[2] / eigp[1]))+ scale_colour_gradientn(colours=c("red", "blue"))
p3 = ggplot(sites, aes(CCA1, CCA2, color=dmfs)) + geom_point() + guides(color=guide_legend(title="dmfs")) +
ggtitle("c)") + xlab("CCA1 (27.7%)") + ylab("CCA2 (19.3%)") +
theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +
coord_fixed(sqrt(eigp[2] / eigp[1]))+ scale_colour_gradientn(colours=c("red", "blue"))
p4 = ggplot(sites, aes(CCA1, CCA2, color=CAL)) + geom_point() + guides(color=guide_legend(title="CAL")) +
ggtitle("d)") + xlab("CCA1 (27.7%)") + ylab("CCA2 (19.3%)") +
theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +
coord_fixed(sqrt(eigp[2] / eigp[1]))+ scale_colour_gradientn(colours=c("red", "blue"))
p5 = ggplot(sites, aes(CCA1, CCA2, color=BOP)) + geom_point() + guides(color=guide_legend(title="BOP")) +
ggtitle("e)") + xlab("CCA1 (27.7%)") + ylab("CCA2 (19.3%)") +
theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +
coord_fixed(sqrt(eigp[2] / eigp[1]))+ scale_colour_gradientn(colours=c("red", "blue"))
p6 = ggplot(sites, aes(CCA1, CCA2, color=PD)) + geom_point() + guides(color=guide_legend(title="PD")) +
ggtitle("f)") + xlab("CCA1 (27.7%)") + ylab("CCA2 (19.3%)") +
theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +
coord_fixed(sqrt(eigp[2] / eigp[1]))+ scale_colour_gradientn(colours=c("red", "blue"))
grid.arrange(p1, p2, p3, p4, p5, p6, ncol=2)
#subset on supragingival
supra = subset_samples(sub.filt, Habitat_Class=="Supra")
supra = prune_taxa(taxa_sums(supra) >1, supra)
#subset on subgingival
sub = subset_samples(sub.filt, Habitat_Class=="Sub")
sub = prune_taxa(taxa_sums(sub) >1, sub)
#transform
Hstatis_subgingival = decostand(data.frame(otu_table(sub)), "hel")
Hstatis_supragingival = decostand(data.frame(otu_table(supra)), "hel")
#generate the model
set.seed(981)
subs = sample_data(supra)$Subject
mod1 = CCorA(Hstatis_subgingival, Hstatis_supragingival, permutations=999)
biplot(mod1, plot.type="ov")
eval = 100*(mod1$Eigenvalues/sum(mod1$Eigenvalues))
#plot subgingival samples
df1 = data.frame(mod1$Cx, sample_data(sub))
p1=ggplot(df1, aes(CanAxis1, CanAxis2, label=Subject, color=Aim)) + geom_text() + ggtitle(("a)")) +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="none") + xlab("3.1%")+ ylab("3.1%")
#plot supragingival samples
df2 = data.frame(mod1$Cy, sample_data(supra))
p2 = ggplot(df2, aes(CanAxis1, CanAxis2, label=Subject, color=Aim)) + geom_text() + ggtitle("b)") +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="none") + xlab("3.1%")+ ylab("3.1%")
#subgingival species
df3 = data.frame(mod1$corr.Y.Cy, tax_table(sub))
#write.csv()
myTax3 = df3$Highest.Rank
p3 = ggplot(df3, aes(CanAxis1, CanAxis2, label=rownames(df3), color=Phylum)) +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="bottom") + geom_text_repel(aes(CanAxis1, CanAxis2, label = myTax3), size = 3)
#supragingival species
df4 = data.frame(mod1$corr.X.Cy, tax_table(supra))
myTax4 = df4$Highest.Rank
p4 = ggplot(df4, aes(CanAxis1, CanAxis2, label=rownames(df4), color=Phylum)) +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="bottom")+ geom_text_repel(aes(CanAxis1, CanAxis2, label = myTax4), size = 3)
#plot
grid.arrange(p1, p2, p3, p4, ncol=2)
mod1
##
## Canonical Correlation Analysis
##
## Call:
## CCorA(Y = Hstatis_subgingival, X = Hstatis_supragingival, permutations = 999)
##
## Y X
## Matrix Ranks 63 82
##
## Pillai's trace: 32.34643
##
## Significance of Pillai's trace:
## from F-distribution: < 2.22e-16
## based on permutations: 0.001
## Permutation: free
## Number of permutations: 999
##
## CanAxis1 CanAxis2 CanAxis3 CanAxis4 CanAxis5
## Canonical Correlations 0.99582 0.99207 0.98786 0.98271 0.97524
## CanAxis6 CanAxis7 CanAxis8 CanAxis9 CanAxis10
## Canonical Correlations 0.97257 0.96719 0.96234 0.95815 0.95326
## CanAxis11 CanAxis12 CanAxis13 CanAxis14 CanAxis15
## Canonical Correlations 0.94811 0.94029 0.93097 0.91924 0.91284
## CanAxis16 CanAxis17 CanAxis18 CanAxis19 CanAxis20
## Canonical Correlations 0.90369 0.89181 0.88321 0.87142 0.86111
## CanAxis21 CanAxis22 CanAxis23 CanAxis24 CanAxis25
## Canonical Correlations 0.85538 0.84064 0.83844 0.82316 0.80178
## CanAxis26 CanAxis27 CanAxis28 CanAxis29 CanAxis30
## Canonical Correlations 0.79894 0.79665 0.79129 0.76745 0.75951
## CanAxis31 CanAxis32 CanAxis33 CanAxis34 CanAxis35
## Canonical Correlations 0.73956 0.72540 0.72085 0.69162 0.67337
## CanAxis36 CanAxis37 CanAxis38 CanAxis39 CanAxis40
## Canonical Correlations 0.65051 0.64334 0.63501 0.61414 0.59257
## CanAxis41 CanAxis42 CanAxis43 CanAxis44 CanAxis45
## Canonical Correlations 0.57645 0.57584 0.55907 0.54152 0.52546
## CanAxis46 CanAxis47 CanAxis48 CanAxis49 CanAxis50
## Canonical Correlations 0.51112 0.49474 0.47702 0.43050 0.42409
## CanAxis51 CanAxis52 CanAxis53 CanAxis54 CanAxis55
## Canonical Correlations 0.38521 0.37556 0.33694 0.32497 0.29939
## CanAxis56 CanAxis57 CanAxis58 CanAxis59 CanAxis60
## Canonical Correlations 0.27436 0.25889 0.23370 0.21744 0.21106
## CanAxis61 CanAxis62 CanAxis63
## Canonical Correlations 0.16926 0.16713 0.1217
##
## Y | X X | Y
## RDA R squares 0.7303 0.6534
## adj. RDA R squares 0.5069 0.4684
#look at s.mutans
sm = subset_taxa(sub.filt, Highest.Rank=="Streptococcus_mutans")
df= data.frame(otu_table(sm), sample_data(sm))
dfm= melt(df, id.vars = colnames(sample_data(sm)))
ordering= c("3", "6","8","9", "11", "14","19", "22","24","25","27","30")
dfm$Tooth_Number <- factor(dfm$Tooth_Number, levels = ordering)
p=ggplot(dfm,aes(Habitat_Class,value)) + geom_bar(stat="identity") + facet_wrap(~Aim)
p = ggplot(dfm,aes(Tooth_Number,PD, size=value, color=Tooth_Class)) + geom_point() + facet_wrap(Habitat_Class~Subject, ncol=8)
p
#Prevotella_histicola
ph = subset_taxa(sub.filt, Highest.Rank=="Prevotella_histicola")
df= data.frame(otu_table(ph), sample_data(ph))
dfm= melt(df, id.vars = colnames(sample_data(ph)))
ordering= c("3", "6","8","9", "11", "14","19", "22","24","25","27","30")
dfm$Tooth_Number <- factor(dfm$Tooth_Number, levels = ordering)
p=ggplot(dfm,aes(Habitat_Class,value)) + geom_bar(stat="identity") + facet_wrap(~Aim)
p = ggplot(dfm,aes(Tooth_Number,PD, size=value, color=Tooth_Class)) + geom_point() + facet_wrap(Habitat_Class~Subject, ncol=8)
p
#look at Scardovia_wiggsiae
sm = subset_taxa(sub.filt, Highest.Rank=="Scardovia_wiggsiae")
df= data.frame(otu_table(sm), sample_data(sm))
dfm= melt(df, id.vars = colnames(sample_data(sm)))
ordering= c("3", "6","8","9", "11", "14","19", "22","24","25","27","30")
dfm$Tooth_Number <- factor(dfm$Tooth_Number, levels = ordering)
p=ggplot(dfm,aes(Habitat_Class,value)) + geom_bar(stat="identity") + facet_wrap(~Aim)
p = ggplot(dfm,aes(Tooth_Number,PD, size=value, color=Tooth_Class)) + geom_point() + facet_wrap(Habitat_Class~Subject, ncol=8)
p
#look at Streptococcus_sanguinis
sm = subset_taxa(sub.filt, Highest.Rank=="Streptococcus_sanguinis")
df= data.frame(otu_table(sm), sample_data(sm))
dfm= melt(df, id.vars = colnames(sample_data(sm)))
ordering= c("3", "6","8","9", "11", "14","19", "22","24","25","27","30")
dfm$Tooth_Number <- factor(dfm$Tooth_Number, levels = ordering)
p=ggplot(dfm,aes(Habitat_Class,value)) + geom_bar(stat="identity") + facet_wrap(~Aim)
p = ggplot(dfm,aes(Tooth_Number,PD, size=value, color=Tooth_Class)) + geom_point() + facet_wrap(Habitat_Class~Subject, ncol=8)
p
#subset on supragingival
supra = subset_samples(sub.filt, Habitat_Class=="Supra")
supra = subset_samples(supra, Aim =="SA1")
supra = prune_taxa(taxa_sums(supra) >1, supra)
#subset on subgingival
sub = subset_samples(sub.filt, Habitat_Class=="Sub")
sub = subset_samples(sub, Aim=="SA1")
sub = prune_taxa(taxa_sums(sub) >1, sub)
#transform
Hstatis_subgingival = decostand(data.frame(otu_table(sub)), "hel")
Hstatis_supragingival = decostand(data.frame(otu_table(supra)), "hel")
eval = 100*(mod1$Eigenvalues/sum(mod1$Eigenvalues))
#generate the model
set.seed(981)
subs = sample_data(supra)$Subject
mod1 = CCorA(Hstatis_subgingival, Hstatis_supragingival, permutations=999)
biplot(mod1, plot.type="ov")
#plot subgingival samples
df1 = data.frame(mod1$Cx, sample_data(sub))
p1=ggplot(df1, aes(CanAxis1, CanAxis2, label=Subject, color=Subject)) + geom_text() + ggtitle(("a)")) +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="none") + xlab("3.1%")+ ylab("3.1%")
#plot supragingival samples
df2 = data.frame(mod1$Cy, sample_data(supra))
p2 = ggplot(df2, aes(CanAxis1, CanAxis2, label=Subject, color=Subject)) + geom_text() + ggtitle("b)") +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="none") + xlab("3.1%")+ ylab("3.1%")
#subgingival species
df3 = data.frame(mod1$corr.Y.Cy, tax_table(sub))
#write.csv()
myTax3 = df3$Highest.Rank
p3 = ggplot(df3, aes(CanAxis1, CanAxis2, label=rownames(df3), color=Phylum)) +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="bottom") + geom_text_repel(aes(CanAxis1, CanAxis2, label = myTax3), size = 3)
#supragingival species
df4 = data.frame(mod1$corr.X.Cy, tax_table(supra))
myTax4 = df4$Highest.Rank
p4 = ggplot(df4, aes(CanAxis1, CanAxis2, label=rownames(df4), color=Phylum)) +
theme_bw()+theme(text = element_text(size=20), axis.text.x = element_text(angle=0, vjust=1)) +
theme(legend.position="bottom")+ geom_text_repel(aes(CanAxis1, CanAxis2, label = myTax4), size = 3)
#plot
grid.arrange(p1, p2, p3, p4, ncol=2)
mod1
##
## Canonical Correlation Analysis
##
## Call:
## CCorA(Y = Hstatis_subgingival, X = Hstatis_supragingival, permutations = 999)
##
## Y X
## Matrix Ranks 63 82
##
## Pillai's trace: 49.00043
##
## Significance of Pillai's trace:
## from F-distribution: 0.00020046
## based on permutations: 0.001
## Permutation: free
## Number of permutations: 999
##
## CanAxis1 CanAxis2 CanAxis3 CanAxis4 CanAxis5
## Canonical Correlations 1.00000 1.00000 1.00000 1.00000 1.00000
## CanAxis6 CanAxis7 CanAxis8 CanAxis9 CanAxis10
## Canonical Correlations 1.00000 1.00000 1.00000 1.00000 1.00000
## CanAxis11 CanAxis12 CanAxis13 CanAxis14 CanAxis15
## Canonical Correlations 1.00000 1.00000 1.00000 1.00000 1.00000
## CanAxis16 CanAxis17 CanAxis18 CanAxis19 CanAxis20
## Canonical Correlations 1.00000 1.00000 1.00000 1.00000 1.00000
## CanAxis21 CanAxis22 CanAxis23 CanAxis24 CanAxis25
## Canonical Correlations 1.00000 1.00000 1.00000 1.00000 1.00000
## CanAxis26 CanAxis27 CanAxis28 CanAxis29 CanAxis30
## Canonical Correlations 1.00000 1.00000 1.00000 1.00000 1.00000
## CanAxis31 CanAxis32 CanAxis33 CanAxis34 CanAxis35
## Canonical Correlations 1.00000 1.00000 1.00000 1.00000 1.00000
## CanAxis36 CanAxis37 CanAxis38 CanAxis39 CanAxis40
## Canonical Correlations 1.00000 0.95843 0.94227 0.92481 0.90742
## CanAxis41 CanAxis42 CanAxis43 CanAxis44 CanAxis45
## Canonical Correlations 0.88583 0.86619 0.85130 0.83236 0.81504
## CanAxis46 CanAxis47 CanAxis48 CanAxis49 CanAxis50
## Canonical Correlations 0.79023 0.75572 0.74428 0.70769 0.67561
## CanAxis51 CanAxis52 CanAxis53 CanAxis54 CanAxis55
## Canonical Correlations 0.65663 0.64684 0.60128 0.58696 0.55685
## CanAxis56 CanAxis57 CanAxis58 CanAxis59 CanAxis60
## Canonical Correlations 0.52476 0.50010 0.43905 0.39938 0.38251
## CanAxis61 CanAxis62 CanAxis63
## Canonical Correlations 0.35834 0.32460 0.2611
##
## Y | X X | Y
## RDA R squares 0.84767 0.7117
## adj. RDA R squares 0.38504 0.3169
#look at s.mutans
sm = subset_taxa(sub.filt, Highest.Rank=="Streptococcus_mutans")
df= data.frame(otu_table(sm), sample_data(sm))
dfm= melt(df, id.vars = colnames(sample_data(sm)))
ordering= c("3", "6","8","9", "11", "14","19", "22","24","25","27","30")
dfm$Tooth_Number <- factor(dfm$Tooth_Number, levels = ordering)
p=ggplot(dfm,aes(Habitat_Class,value)) + geom_bar(stat="identity") + facet_wrap(~Aim)
p = ggplot(dfm,aes(Tooth_Number,PD, size=value, color=Tooth_Class)) + geom_point() + facet_wrap(Habitat_Class~Subject, ncol=8)
#
ph = subset_taxa(sub.filt, Highest.Rank=="Prevotella_histicola")
df= data.frame(otu_table(ph), sample_data(ph))
dfm= melt(df, id.vars = colnames(sample_data(ph)))
ordering= c("3", "6","8","9", "11", "14","19", "22","24","25","27","30")
dfm$Tooth_Number <- factor(dfm$Tooth_Number, levels = ordering)
p=ggplot(dfm,aes(Habitat_Class,value)) + geom_bar(stat="identity") + facet_wrap(~Aim)
p
p = ggplot(dfm,aes(Tooth_Number,PD, size=value, color=Tooth_Class)) + geom_point() + facet_wrap(Habitat_Class~Subject, ncol=8)
p